# generate 100 observations
# NOTE(review): no set.seed() call is visible in this chunk, so the draws --
# and the empirical summaries shown below -- are not reproducible as written;
# presumably a seed is set elsewhere in the document. Confirm.
I <- 100     # number of observations
mu <- 3.7    # true mean of the data-generating process
sigma <- 1.3 # true SD of the data-generating process
y <- rnorm(I, mu, sigma)  # simulate I draws from Normal(mu, sigma)
# empirical mean and SD
mean(y) ; sd(y)
[1] 3.649769
[1] 1.350531
Bayesian Modeling with Stan using CmdStanR
\[ \begin{aligned} y_i &\sim \mathrm{Normal}(\mu, \sigma), \quad i \in 1:I \\ \mu &\sim p(.) \\ \sigma &\sim p(.) \\ \end{aligned} \]
We want to know \(p(\mu, \sigma \mid y)\)
Static typing: forces intent and catches errors sooner
// mean.stan: estimate the mean and SD of a normal outcome.
functions {
// user-defined functions go here
}
data {
int<lower=0> I; // number of observations
vector[I] y; // outcome vector
}
transformed data {
// computed once per run, before sampling starts
real mean_y = mean(y);
real sd_y = sd(y);
// printed once per chain at startup (see the "Chain k mean_y: ..." lines)
print("mean_y: ", mean_y, ", ", "sd_y: ", sd_y);
}
parameters {
real mu; // mean
real<lower=0> sigma; // SD (lower bound keeps the sampler on sigma > 0)
}
transformed parameters {
// (empty) deterministic transforms of parameters would go here
}
model {
// priors
mu ~ normal(0, 5);
sigma ~ exponential(1);
// likelihood (vectorised over all I observations)
y ~ normal(mu, sigma);
}
generated quantities {
// differential entropy of Normal(mu, sigma): 0.5 * log(2*pi*e*sigma^2)
real entropy = 0.5 * log(2 * pi() * e() * square(sigma));
}
# load cmdstanr and set number of cores
library(cmdstanr)
options(mc.cores = 4)  # run the 4 chains in parallel
# compile model
# mean_mod <- cmdstan_model(stan_file = "Stan/mean.stan")
# Stan data: names must match the declarations in the Stan data block
mean_dat <- list(I = I, y = y)
# fit the model with NUTS (short chains, for illustration only);
# use TRUE rather than T -- T/F are ordinary variables and can be reassigned
mean_fit <- mean_mod$sample(data = mean_dat,
                            iter_warmup = 200, iter_sampling = 200,
                            save_warmup = TRUE, refresh = 0)
Running MCMC with 4 parallel chains...
Chain 1 mean_y: 3.64977, sd_y: 1.35053
Chain 2 mean_y: 3.64977, sd_y: 1.35053
Chain 3 mean_y: 3.64977, sd_y: 1.35053
Chain 4 mean_y: 3.64977, sd_y: 1.35053
Chain 1 finished in 0.0 seconds.
Chain 2 finished in 0.0 seconds.
Chain 3 finished in 0.0 seconds.
Chain 4 finished in 0.0 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.0 seconds.
Total execution time: 0.2 seconds.
mod$sample() runs Stan’s HMC and yields a CmdStanMCMC object. CmdStanMCMC objects have associated methods.
fit$draws() returns posterior draws for all quantities defined in parameters, transformed parameters, and generated quantities. fit$summary() summarises these quantities (means, uncertainty, diagnostics, etc.)
# A tibble: 4 × 6
variable median q5 q95 ess_bulk rhat
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 lp__ -81.6 -84.0 -80.9 497. 1.00
2 mu 3.65 3.42 3.85 636. 1.01
3 sigma 1.35 1.19 1.54 620. 1.00
4 entropy 1.72 1.60 1.85 620. 1.00
pacman::p_load(posterior, tidybayes, distributional, ggdist)
# one row per parameter, paired with its matching prior distribution object
mean_fit |>
  gather_rvars(mu, sigma) |>
  mutate(.prior = c(dist_normal(0, 5),       # prior on mu
                    dist_exponential(1))) |> # prior on sigma
  ggplot() +
  facet_wrap(~ .variable,
             ncol = 1,
             scales = "free") +
  # prior slab first, posterior overlaid in red
  stat_slab(aes(xdist = .prior),
            alpha = 2/3) +
  stat_slab(aes(xdist = .value),
            alpha = 2/3,
            fill = "red4") +
  ggeasy::easy_remove_y_axis() +
  coord_cartesian(expand = FALSE) +  # FALSE, not F: T/F are reassignable
  labs(x = "Prior and posterior distributions")
\[ \begin{aligned} y_i &\sim \mathrm{Normal}(\mu_{[g_i]}, \sigma_{[g_i]}), \quad i \in 1:I, \ g \in 1:2 \\ \mu_1, \mu_2 &\sim p(.) \\ \sigma_1, \sigma_2 &\sim p(.) \end{aligned} \]
// t.stan: two-group comparison of means and SDs (a "Bayesian t-test").
data {
int<lower=0> I; // number of observations
vector[I] y; // outcome vector
array[I] int<lower=1, upper=2> g; // group identifier
}
parameters {
vector[2] mu; // means, one per group
vector<lower=0>[2] sigma; // SDs, one per group
}
model {
// priors (same prior for both groups; vectorised over the 2 elements)
mu ~ normal(0, 5);
sigma ~ exponential(1);
// likelihood: mu[g] / sigma[g] pick each observation's group parameters
y ~ normal(mu[g], sigma[g]);
}
generated quantities {
// difference in means and SDs (group 2 minus group 1), computed per draw
real delta_mu = mu[2] - mu[1];
real delta_sigma = sigma[2] - sigma[1];
}
# compile model
# t_mod <- cmdstan_model(stan_file = "Stan/t.stan")
# fit the two-group model with NUTS, using cmdstanr's default
# iteration counts; refresh = 0 suppresses per-iteration progress
t_fit <- t_mod$sample(data = list(I = I, y = y, g = g),
                      refresh = 0)
Running MCMC with 4 parallel chains...
Chain 1 finished in 0.0 seconds.
Chain 2 finished in 0.0 seconds.
Chain 3 finished in 0.1 seconds.
Chain 4 finished in 0.1 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.0 seconds.
Total execution time: 0.2 seconds.
\[ \begin{aligned} y_i &\sim \mathrm{Normal}(\mu_i, \sigma), \quad i \in 1:I \\ \mu_i &= \alpha + \beta \cdot x_i \\ \alpha, \beta, \sigma &\sim p(.) \end{aligned} \]
Introducing log_lik and yrep
// lm.stan: simple linear regression, with pointwise log-likelihood
// (for loo) and posterior predictive draws (for checking).
data {
int<lower=0> I; // number of observations
vector[I] y, x; // outcome vector and covariate
}
parameters {
real alpha, beta; // intercept and slope
real<lower=0> sigma; // residual SD
}
model {
// priors
alpha ~ normal(0, 5);
beta ~ normal(0, 1);
sigma ~ exponential(1);
// likelihood (vectorised); equivalent forms kept below for reference
// vector[I] mu = alpha + beta * x;
y ~ normal(alpha + beta * x, sigma);
// target += normal_lupdf(y | alpha + beta * x, sigma);
}
generated quantities {
// pointwise log-likelihood values, one per observation
vector[I] log_lik;
for (i in 1:I) {
log_lik[i] = normal_lpdf(y[i] | alpha + beta * x[i], sigma);
}
// posterior predictive distribution: one simulated dataset per draw
array[I] real yrep = normal_rng(alpha + beta * x, sigma);
}
# compile model
# lm_mod <- cmdstan_model(stan_file = "Stan/lm.stan")
# Stan data
lm_dat <- list(I = I, y = y, x = x) |> glimpse()
List of 3
$ I: num 100
$ y: num [1:100] 0.891 -2.458 -2.36 -2.023 -0.888 ...
$ x: num [1:100] 2.194 -2.367 0.726 -0.466 0.293 ...
Running MCMC with 4 parallel chains...
Chain 1 finished in 0.1 seconds.
Chain 2 finished in 0.1 seconds.
Chain 3 finished in 0.1 seconds.
Chain 4 finished in 0.1 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.1 seconds.
Total execution time: 0.2 seconds.
# check output
lm_fit$summary(c("alpha", "beta", "sigma")) |>
mutate(truth = c(alpha, beta, sigma)) |>
select(-sd, -mad)
# A tibble: 3 × 9
variable mean median q5 q95 rhat ess_bulk ess_tail truth
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 alpha -1.07 -1.07 -1.19 -0.949 1.00 3541. 2424. -1
2 beta 0.576 0.576 0.445 0.706 1.00 3753. 2620. 0.5
3 sigma 0.740 0.737 0.653 0.835 1.00 3619. 3086. 0.8
pacman::p_load(tidybayes, ggdist)
x_pred <- seq(min(x), max(x), length.out = 200)
lm_fit |>
spread_draws(alpha, beta, sigma)
# A tibble: 4,000 × 6
.chain .iteration .draw alpha beta sigma
<int> <int> <int> <dbl> <dbl> <dbl>
1 1 1 1 -1.02 0.608 0.755
2 1 2 2 -1.13 0.533 0.747
3 1 3 3 -1.07 0.546 0.683
4 1 4 4 -1.20 0.557 0.777
5 1 5 5 -1.14 0.629 0.692
6 1 6 6 -0.985 0.523 0.762
7 1 7 7 -0.936 0.505 0.727
8 1 8 8 -0.946 0.518 0.721
9 1 9 9 -0.991 0.547 0.742
10 1 10 10 -1.03 0.575 0.746
# ℹ 3,990 more rows
pacman::p_load(tidybayes, ggdist)
x_pred <- seq(min(x), max(x), length.out = 200)
lm_fit |>
spread_draws(alpha, beta, sigma) |>
crossing(x = x_pred)
# A tibble: 800,000 × 7
.chain .iteration .draw alpha beta sigma x
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 -1.02 0.608 0.755 -2.37
2 1 1 1 -1.02 0.608 0.755 -2.34
3 1 1 1 -1.02 0.608 0.755 -2.32
4 1 1 1 -1.02 0.608 0.755 -2.30
5 1 1 1 -1.02 0.608 0.755 -2.28
6 1 1 1 -1.02 0.608 0.755 -2.25
7 1 1 1 -1.02 0.608 0.755 -2.23
8 1 1 1 -1.02 0.608 0.755 -2.21
9 1 1 1 -1.02 0.608 0.755 -2.18
10 1 1 1 -1.02 0.608 0.755 -2.16
# ℹ 799,990 more rows
4000 draws \(\times\) 200 input values = 800,000 rows
pacman::p_load(tidybayes, ggdist)
# grid of 200 covariate values spanning the observed range of x
x_pred <- seq(min(x), max(x), length.out = 200)
lm_fit |>
  spread_draws(alpha, beta, sigma) |>
  crossing(x = x_pred) |>          # 4000 draws x 200 x values
  mutate(mu = alpha + beta * x) |> # fitted line, per draw
  ggplot(aes(x, mu)) +
  # 90% credible ribbon for the conditional mean
  stat_lineribbon(.width = 0.9,
                  fill = "blue4",
                  alpha = 1/2) +
  geom_point(aes(y = y),
             data = lm_tbl,        # observed data overlaid as points
             alpha = 2/3)
Important
Prior predictive checks simulate potential data from the joint prior distribution. Posterior predictive checks simulate potential data from the joint posterior distribution after learning parameters from data.
log_lik
Computed from 4000 by 100 log-likelihood matrix.
Estimate SE
elpd_loo -112.8 6.8
p_loo 2.9 0.5
looic 225.6 13.5
------
MCSE of elpd_loo is 0.0.
MCSE and ESS estimates assume MCMC draws (r_eff in [0.7, 1.0]).
All Pareto k estimates are good (k < 0.7).
See help('pareto-k-diagnostic') for details.
\[ \begin{aligned} y_i &\sim \mathrm{Bernoulli}(\psi_i), \quad i \in 1:I, \ g \in 1:G \\ \psi_i &= \operatorname{logit}^{-1} (\alpha_{[g_i]} + \beta \cdot x_i) \\ \alpha_{g}, \beta &\sim p(.) \end{aligned} \]
// glm.stan: logistic regression with group-specific intercepts.
data {
int<lower=1> I, G; // number of observations and groups
array[I] int<lower=0, upper=1> y; // binary outcome array
vector[I] x; // continuous covariate
array[I] int<lower=1, upper=G> g; // group identifier
}
parameters {
vector[G] alpha; // group-level intercepts (logit scale)
real beta; // slope (logit scale)
}
model {
// priors; logistic(0, 1) on a logit-scale intercept corresponds to
// a uniform prior on the probability scale
alpha ~ logistic(0, 1);
beta ~ normal(0, 1);
// likelihood: bernoulli_logit is the numerically stable form of the
// commented-out bernoulli(inv_logit(.)) version below
y ~ bernoulli_logit(alpha[g] + beta * x);
// y ~ bernoulli(inv_logit(alpha[g] + beta * x));
}
generated quantities {
// pointwise log-likelihood (for loo) and posterior predictive draws
vector[I] log_lik;
for (i in 1:I) {
log_lik[i] = bernoulli_logit_lpmf(y[i] | alpha[g[i]] + beta * x[i]);
}
array[I] int yrep = bernoulli_logit_rng(alpha[g] + beta * x);
}
# compile model
# glm_mod <- cmdstan_model(stan_file = "Stan/glm.stan")
# Stan data
glm_dat <- list(I = I, G = G, y = y, x = x, g = g) |> glimpse()
List of 5
$ I: num 100
$ G: num 4
$ y: int [1:100] 0 1 1 0 1 0 0 1 1 0 ...
$ x: num [1:100] 0.252 -1.797 0.225 0.322 1.324 ...
$ g: int [1:100] 3 1 1 2 1 4 3 2 4 2 ...
Running MCMC with 4 parallel chains...
Chain 1 finished in 0.1 seconds.
Chain 2 finished in 0.1 seconds.
Chain 3 finished in 0.1 seconds.
Chain 4 finished in 0.1 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.1 seconds.
Total execution time: 0.2 seconds.
::: fragment